{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e1096748-2cbb-4bf1-aa28-fd18dda97022",
   "metadata": {},
   "source": [
    "# Data Sampling and Shuffling"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ecdba48e-dbeb-4fd3-901f-88e71aec70d1",
   "metadata": {},
   "source": [
    "### Data Splitting without Random Seed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e569e619-5e76-492a-b110-0e3b36cb9149",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "x_toydata = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9.])\n",
    "y_labels = np.array([ 0,  1,  0,  1, 1,  0,  0,  1,  0 ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c85040c1-81e3-4c08-9f07-644cd74cca8b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train [9. 2. 4. 6. 1. 5.]\n",
      "X_test [7. 3. 8.]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    x_toydata, y_labels, test_size=0.3, shuffle=True, stratify=y_labels\n",
    ")\n",
    "\n",
    "print(\"X_train\", X_train)\n",
    "print(\"X_test\", X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8631e484-791a-4adf-857a-2a0529a6bc1f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train [5. 4. 6. 3. 1. 2.]\n",
      "X_test [7. 8. 9.]\n"
     ]
    }
   ],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    x_toydata, y_labels, test_size=0.3, shuffle=True, stratify=y_labels\n",
    ")\n",
    "\n",
    "print(\"X_train\", X_train)\n",
    "print(\"X_test\", X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e8ecec5-b9d8-487c-872b-34537ab7c441",
   "metadata": {},
   "source": [
    "### Data Splitting with Random Seed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "d49b4ee6-b4ff-4d5b-adb4-3fc541fb0b9a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train [9. 2. 8. 3. 5. 7.]\n",
      "X_test [4. 1. 6.]\n"
     ]
    }
   ],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    x_toydata, y_labels, test_size=0.3, shuffle=True, stratify=y_labels,\n",
    "    random_state=123\n",
    ")\n",
    "\n",
    "print(\"X_train\", X_train)\n",
    "print(\"X_test\", X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e2a87370-de68-4c92-9d99-3faed33657f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "X_train [9. 2. 8. 3. 5. 7.]\n",
      "X_test [4. 1. 6.]\n"
     ]
    }
   ],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    x_toydata, y_labels, test_size=0.3, shuffle=True, stratify=y_labels,\n",
    "    random_state=123\n",
    ")\n",
    "\n",
    "print(\"X_train\", X_train)\n",
    "print(\"X_test\", X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2a5650fd-30b7-4d69-bbce-77f2c36ad272",
   "metadata": {},
   "source": [
    "### K-fold without Random Seed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "9e234e41-2ac8-4aa9-b165-c30ee1a12448",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Feature values [2. 3. 4. 5. 6. 7.]\n",
      "Feature values [1. 2. 4. 7. 8. 9.]\n",
      "Feature values [1. 3. 5. 6. 8. 9.]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "cv = StratifiedKFold(n_splits=3, shuffle=True)\n",
    "\n",
    "for train_idx, valid_idx in cv.split(x_toydata, y_labels):\n",
    "    print(\"Feature values\", x_toydata[train_idx])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "ba10c301-3346-4c21-913e-4c240d932084",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Feature values [2. 3. 4. 7. 8. 9.]\n",
      "Feature values [1. 2. 4. 5. 6. 9.]\n",
      "Feature values [1. 3. 5. 6. 7. 8.]\n"
     ]
    }
   ],
   "source": [
    "for train_idx, valid_idx in cv.split(x_toydata, y_labels):\n",
    "    print(\"Feature values\", x_toydata[train_idx])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1961da6d-50ea-4247-82d8-c0ee3f592edd",
   "metadata": {},
   "source": [
    "### K-fold with Random Seed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "4f0390cb-ff36-4441-968e-cf1e2bde21ac",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Feature values [3. 4. 5. 6. 8. 9.]\n",
      "Feature values [1. 2. 4. 5. 6. 7.]\n",
      "Feature values [1. 2. 3. 7. 8. 9.]\n"
     ]
    }
   ],
   "source": [
    "cv = StratifiedKFold(n_splits=3, random_state=123, shuffle=True)\n",
    "\n",
    "for train_idx, valid_idx in cv.split(x_toydata, y_labels):\n",
    "    print(\"Feature values\", x_toydata[train_idx])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "dd766cf8-c818-4f43-9b4c-a0d51ef8ec21",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Feature values [3. 4. 5. 6. 8. 9.]\n",
      "Feature values [1. 2. 4. 5. 6. 7.]\n",
      "Feature values [1. 2. 3. 7. 8. 9.]\n"
     ]
    }
   ],
   "source": [
    "cv = StratifiedKFold(n_splits=3, random_state=123, shuffle=True)\n",
    "\n",
    "for train_idx, valid_idx in cv.split(x_toydata, y_labels):\n",
    "    print(\"Feature values\", x_toydata[train_idx])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c9b9986",
   "metadata": {},
   "source": [
    "## Dataset Loading without Random Seed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "82de5768",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([7, 6, 2, 5, 3, 8, 8, 6])\n"
     ]
    }
   ],
   "source": [
    "from torchvision import datasets, transforms\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "\n",
    "train_dataset = datasets.MNIST(root='./data', train=True, transform=transforms.ToTensor(), download=True)\n",
    "train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)\n",
    "\n",
    "for inputs, labels in train_loader:\n",
    "    pass\n",
    "print(labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8def12a3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([4, 6, 5, 2, 5, 2, 9, 3])\n"
     ]
    }
   ],
   "source": [
    "train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)\n",
    "\n",
    "for inputs, labels in train_loader:\n",
    "    pass\n",
    "print(labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18d33c0c",
   "metadata": {},
   "source": [
    "## Dataset Loading with Random Seed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "30627a3d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([1, 8, 8, 7, 2, 5, 4, 1])\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "torch.manual_seed(123)\n",
    "train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)\n",
    "\n",
    "for inputs, labels in train_loader:\n",
    "    pass\n",
    "print(labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e346084e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([1, 8, 8, 7, 2, 5, 4, 1])\n"
     ]
    }
   ],
   "source": [
    "torch.manual_seed(123)\n",
    "train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)\n",
    "\n",
    "for inputs, labels in train_loader:\n",
    "    pass\n",
    "print(labels)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
